movies1 <- movies[!is.na(movies$budget),] ggplot(movies1, aes(x=budget, y=rating)) + geom_point()
ggplot(movies1, aes(x=budget, y=rating)) + geom_point(shape=5, size=3)
ggplot2This usually happens accidentally.
ggplot(movies1, aes(x=budget, y=rating, color=Action)) + geom_point()
ggplot(movies1, aes(x=budget, y=rating, color=factor(Action))) +
geom_point() + labs(color='Action Movie?')
ggplot(movies1, aes(x=budget, y=rating, color=factor(Action), shape=(length > 120))) +
geom_point(size=3) + labs(color='Action Movie?')
ggplot(movies, aes(x=votes, y=rating)) + geom_point() +
stat_smooth(method=lm, level=.95) # add se=FALSE to disable CI
ggplot(movies, aes(x=votes, y=rating)) + geom_point() + stat_smooth()
ggplot(movies1, aes(x=budget, y=rating, color=factor(Action))) +
geom_point() + labs(color='Action Movie?') + stat_smooth(method=lm, se=FALSE)
lm_model <- lm(rating ~ budget, data=movies1)
ggplot(movies1, aes(x=budget, y=rating)) + geom_point() +
geom_line(aes(x=budget, y=lm_model$fitted.values), color='blue')
geom_textstarmovies <- movies[movies$votes > mean(movies$votes),] starmovies <- starmovies[order(-starmovies$rating),][1:10,] ggplot(starmovies, aes(x=votes, y=rating)) + geom_point() + geom_text(aes(label=title))
ggplot(starmovies, aes(x=votes, y=rating)) + geom_point(color='red') +
geom_text(aes(label=title), hjust=0, vjust=0, angle=20) +
xlim(0, max(starmovies$votes)*2) +
ylim(min(starmovies$rating), 9.2)
We only choose the movies with single type to simplify the question.
movietype <- colnames(movies)[18:24] movies1_singletype <- movies1[rowSums(movies1[, movietype]) == 1,] # remove multi-typed movietype_alt <- c(movietype[length(movietype)], movietype[-length(movietype)]) # convert multiple dummies into one factor as grouping var # a little matrix operation will do the trick dummies <- as.matrix(movies1_singletype[, movietype_alt]) movies1_singletype$Type <- factor(dummies %*% (1:length(movietype_alt)), labels=movietype_alt) # Compute the Average Budget of Each Type tapply(movies1_singletype$budget, movies1_singletype$Type, mean)
## Short Action Animation Comedy Drama Documentary ## 396133.1 32698189.6 32311451.6 11921970.6 10456690.5 729704.8 ## Romance ## 5603688.0
The first factor level of movietype, Short, is represented as the intercept term.
lmfit <- lm(as.formula("budget ~ Type"), movies1_singletype)
summary(lmfit)$coef
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 396133.1 1715935 0.2308556 8.174455e-01 ## TypeAction 32302056.5 2062063 15.6649187 7.005190e-53 ## TypeAnimation 31915318.5 4157723 7.6761525 2.316874e-14 ## TypeComedy 11525837.5 1888686 6.1025705 1.202481e-09 ## TypeDrama 10060557.4 1820528 5.5261750 3.604075e-08 ## TypeDocumentary 333571.7 2881175 0.1157763 9.078389e-01 ## TypeRomance 5207554.9 3713295 1.4024082 1.609149e-01
The last predictor, Short is combined into the intercept term.
# mean(movies1_singletype[movies1_singletype$Animation == 1, 'budget'])
lmfit <- lm(as.formula(paste('budget ~', paste(movietype, collapse=' + '))),
movies1_singletype)
summary(lmfit)$coef
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 396133.1 1715935 0.2308556 8.174455e-01 ## Action 32302056.5 2062063 15.6649187 7.005190e-53 ## Animation 31915318.5 4157723 7.6761525 2.316874e-14 ## Comedy 11525837.5 1888686 6.1025705 1.202481e-09 ## Drama 10060557.4 1820528 5.5261750 3.604075e-08 ## Documentary 333571.7 2881175 0.1157763 9.078389e-01 ## Romance 5207554.9 3713295 1.4024082 1.609149e-01
What is the association between cost and rating, conditional on type?
movies1_reg_plot <- ggplot(movies1_singletype, aes(x=budget, y=rating, color=Type)) +
geom_point(shape=1) +
# set fullrange=T will extend the fitted line outside the sample range
stat_smooth(method=lm, se=FALSE, fullrange=FALSE, size=1.5) +
# color is the grouping interface, hence scale_color_*
scale_color_discrete(name='Movie Type: # of samples',
labels=paste(levels(movies1_singletype$Type), ': ',
table(movies1_singletype$Type)))
movies1_reg_plot
Draw a scatter plot of Temprature and RH in Taipei, Taichung, Kaoshiung and Hualien in 2013-05, 2014-05 and 2015-05. Furthermore, each location has its own color and regression line.
Hint: Use following code to read all data and slice the data you want.
source("R/get_weather_all.R")
weather_all <- get_weather_all()
may <- weather_all[weather_all$month == "May",]
ggplot(may, aes(x=RH, y=Temperature, color=location)) +
geom_point(shape=1) +
stat_smooth(method = lm, size = 1.5)
interact_terms <- paste(paste(movietype, '*budget', sep=''), collapse=' + ')
lmfit <- lm(as.formula(paste('rating ~', interact_terms)), movies1_singletype)
tail(summary(lmfit)$coef)
## Estimate Std. Error t value Pr(>|t|) ## Action:budget 1.580218e-08 3.331784e-08 0.47428594 0.6353367 ## budget:Animation 8.246542e-09 3.379426e-08 0.24402199 0.8072334 ## budget:Comedy -6.222262e-10 3.337127e-08 -0.01864557 0.9851253 ## budget:Drama 1.295424e-08 3.333295e-08 0.38863176 0.6975810 ## budget:Documentary -8.504501e-08 9.010076e-08 -0.94388788 0.3453164 ## budget:Romance -3.983138e-08 4.086954e-08 -0.97459824 0.3298521
plot(movies1$budget, movies1$rating) # base solution
abline(h=median(movies1$rating), col='red')
text(x=max(movies1$budget)*.9, y=median(movies1$rating),
labels='Median of Rating', col='red', pos=1)
brggp <- ggplot(movies1, aes(x=budget, y=rating)) + geom_point() brggp + geom_hline(yintercept=median(movies1$rating)) # ?geom_abline for general setup # brggp + geom_hline(data=movies1, aes(yintercept=median(rating))) # the same # brggp + geom_hline(aes(yintercept=median(movies1$rating))) # the same
brggp + geom_hline(yintercept=median(movies1$rating), color='red') +
annotate('text', x=Inf, y=median(movies1$rating),
label='Medaion of Rating', color='red', vjust=1.2, hjust=1)
# Don't use geom_text for single annotation to avoid overplotting
shaw <- movies1[grep('Shawshank Redemption', movies1$title, fixed=TRUE),]
brggp + annotate('segment', xend=shaw$budget, yend=shaw$rating, x=Inf, y=-Inf,
arrow=grid::arrow(), color='red') +
annotate('text', label='The Shawshank Redemption?', x=Inf, y=-Inf,
hjust=1.5, vjust=-1, color='red')
yearcount <- aggregate(title ~ year, data=movies, FUN=length)
ggplot(yearcount, aes(x=year, y=title)) + geom_line() +
annotate('rect', xmin=1990, xmax=2000, ymin=-Inf, ymax=Inf, fill='blue', alpha=.25)
gg <- ggplot(movies1_singletype, aes(x=rating, y=..density..)) + geom_bar() gg + facet_grid(Action ~ .) # Plot with grouping variable in different window (Vertical)
# Plot with grouping variable in different window (Horizontal) gg + facet_grid(. ~ Action)
movies1_singletype$modern <- (movies1_singletype$year > 2000) ggplot(movies1_singletype, aes(x=rating, y=..density..)) + geom_bar() + facet_grid(modern ~ Action)
movies1_singletype$rated <- (movies1_singletype$mpaa != '') ggplot(movies1_singletype, aes(x=rating, color=modern)) + geom_line(stat="density") + facet_grid(Type ~ rated)
facet_gridprint(label_value) # the default labeller plugg in facet_grid(..., labeller)
## function (variable, value) ## as.character(value) ## <environment: namespace:ggplot2>
mylabeller <- function(variable, value){
if ( variable=='rated' )
value <- ifelse(value == TRUE, 'Rated Movies', 'Unrated Movies')
else if ( variable=='Type' )
as.character(value)
}
ggplot(movies1_singletype, aes(x=rating, color=modern)) + geom_line(stat="density") + facet_grid(Type ~ rated, labeller=mylabeller)
gridExtra (1/3)library(gridExtra)
drawPoint <- function(i) {
ggplot(data.frame(x=1, y=1), aes(x=x,y=y)) +
geom_point(shape=i, size=5) +
ggtitle(sprintf('shape=%s',i)) +
theme(axis.text.x=element_blank(), axis.text.y=element_blank()) +
xlab(NULL) + ylab(NULL)
}
drawPoint(25)
gridExtra (2/3)symbol_points <- mapply(drawPoint, 1:25, SIMPLIFY=FALSE) symbols <- do.call(arrangeGrob, symbol_points) symbols
gridExtra (3/3)